{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "underlying-hartford",
   "metadata": {},
   "source": [
    "# 期末项目要求\n",
    "\n",
    "*  CNKI 文章下载：\n",
    "* 提交ipynb档或者可执行.py项目文件\n",
    "* 有较好的文档描述和数据描述（包含数据目标和数据结果描述）\n",
    "* 主要目标：可对CNKI PDF文件进行依次下载，解决中间处理问题（并做描述）\n",
    "* 次要目标：数据分析（关键词替换）——数据可视化（VOSviewer--keywords_co-occurrence）\n",
    "* 将作业上传至gitee/github，作为数据挖掘项目作品\n",
    "\n",
    "提交：gitee/github 和 数据"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "imperial-advisory",
   "metadata": {},
   "source": [
    "# 准备工作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 301,
   "id": "dense-healthcare",
   "metadata": {},
   "outputs": [],
   "source": [
    "import base64\n",
    "import json\n",
    "import os\n",
    "import time\n",
    "from random import random\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import requests\n",
    "import requests_html\n",
    "from lxml.html import fromstring\n",
    "from PIL import Image, ImageEnhance\n",
    "from requests_html import HTMLSession\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "equipped-cream",
   "metadata": {},
   "source": [
    "## 打开窗口"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 260,
   "id": "fossil-reunion",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configure Chrome, then launch the Selenium-controlled browser window.\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')  # works around the 'DevToolsActivePort file doesn't exist' startup error\n",
    "# NOTE(review): Chrome documents this switch as width,height (comma-separated);\n",
    "# confirm that the 'x' form below is actually honoured.\n",
    "opts.add_argument('window-size=1920x3000')  # requested browser resolution\n",
    "opts.add_argument('--disable-gpu')  # flag the Chrome docs recommend to sidestep a bug\n",
    "opts.add_argument('--hide-scrollbars')  # hide scrollbars, for a few special pages\n",
    "# Optional switches, intentionally left off:\n",
    "#   'blink-settings=imagesEnabled=false' -- skip image loading for speed\n",
    "#   '--headless' -- no visible window; needed on Linux hosts without a display\n",
    "\n",
    "# `options=` replaces the deprecated `chrome_options=` keyword.\n",
    "driver = webdriver.Chrome(options=opts)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "proof-macintosh",
   "metadata": {},
   "source": [
    "# 登录页面"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 261,
   "id": "revolutionary-hardware",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the CNKI home page in the controlled browser.\n",
    "cnki_home = \"https://www.cnki.net/\"\n",
    "driver.get(cnki_home)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 262,
   "id": "breathing-plaza",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'中山大学'"
      ]
     },
     "execution_count": 262,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check the login state by reading the text of the login-name element.\n",
    "# find_element(By.ID, ...) is used because find_element_by_id is removed in newer Selenium.\n",
    "driver.find_element(By.ID, 'Ecp_loginShowName1').get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 382,
   "id": "worst-participant",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the advanced-search entry (element with id \"highSearch\").\n",
    "element = driver.find_element(By.XPATH, '//*[@id=\"highSearch\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 385,
   "id": "closed-hamilton",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'CDwindow-C61C6FD0F540F92C3E6345B8208895D9'"
      ]
     },
     "execution_count": 385,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show which window handle the driver is currently attached to.\n",
    "active_handle = driver.current_window_handle\n",
    "active_handle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 386,
   "id": "final-thursday",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Switch the driver to the new page: the second entry in the window-handle list.\n",
    "new_page_handle = driver.window_handles[1]\n",
    "driver.switch_to.window(new_page_handle)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "built-numbers",
   "metadata": {},
   "source": [
    "# 专业检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 387,
   "id": "occasional-metro",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the list item whose data-id is \"xsqk\".\n",
    "# NOTE(review): presumably the academic-journals source -- confirm on the page.\n",
    "element = driver.find_element(By.XPATH, '//li[@data-id=\"xsqk\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 388,
   "id": "unlike-identity",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the professional-search entry (element named \"majorSearch\").\n",
    "element = driver.find_element(By.NAME, 'majorSearch')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 389,
   "id": "sized-gravity",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the input named \"all\".\n",
    "# NOTE(review): presumably a select-all checkbox -- confirm on the page.\n",
    "# Find and click are separate so `element` holds the WebElement, not click()'s None.\n",
    "element = driver.find_element(By.XPATH, '//input[@name=\"all\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "unique-ceramic",
   "metadata": {},
   "source": [
    "# 关键词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 390,
   "id": "going-financing",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search expression: a subject (SU) term combined with any of four title (TI) terms.\n",
    "query = (\n",
    "    'SU = \"新媒体\" AND  '\n",
    "    '(TI =\"人工智能\" OR  TI =\"大数据\"  OR TI = \"AI\" OR TI = \"big data\")'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 391,
   "id": "responsible-mauritius",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Type the query into the textarea, clearing any existing text first.\n",
    "element = driver.find_element(By.XPATH, '//textarea')\n",
    "element.clear()\n",
    "element.send_keys(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 392,
   "id": "willing-painting",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the search button (the input whose value is \"检索\").\n",
    "# Find and click are separate so `element` holds the WebElement, not click()'s None.\n",
    "element = driver.find_element(By.XPATH, '//input[@value=\"检索\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 273,
   "id": "annoying-agency",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1,033'"
      ]
     },
     "execution_count": 273,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the count displayed inside the countPageDiv element.\n",
    "# NOTE(review): presumably the total number of search hits -- confirm.\n",
    "element = driver.find_element(By.XPATH, '//*[@id=\"countPageDiv\"]/span[1]/em')\n",
    "element.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 274,
   "id": "incorporate-lexington",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the control rendered with class \"icon icon-sort\".\n",
    "# NOTE(review): presumably opens the per-page dropdown -- confirm.\n",
    "# Find and click are separate so `element` holds the WebElement, not click()'s None.\n",
    "element = driver.find_element(By.XPATH, '//i[@class=\"icon icon-sort\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 275,
   "id": "corrected-warning",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Change the number of articles listed per page to 50.\n",
    "# Find and click are separate so `element` holds the WebElement, not click()'s None.\n",
    "element = driver.find_element(By.XPATH, '//div[@id=\"perPageDiv\"]//li[@data-val=\"50\"]/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 276,
   "id": "promising-payday",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1/52'"
      ]
     },
     "execution_count": 276,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# How many result pages are there? Read the page marker text (e.g. '1/52').\n",
    "element = driver.find_element(By.XPATH, '//span[@class=\"countPageMark\"]')\n",
    "element.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 277,
   "id": "optional-desktop",
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>农业大数据在农业经济管理中的应用分析</td>\n",
       "      <td>李文萍</td>\n",
       "      <td>农家参谋</td>\n",
       "      <td>2021-06-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>大数据、新媒体技术在广电中的应用和前景简析</td>\n",
       "      <td>成连港</td>\n",
       "      <td>广播电视信息</td>\n",
       "      <td>2021-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>42.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>从技术与社会的互动中看当前新媒体技术的发展——以5G和人工智能技术为例</td>\n",
       "      <td>李颖章</td>\n",
       "      <td>中国传媒科技</td>\n",
       "      <td>2021-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>大数据时代大学思政教学改革实践及其反思——以抖音新媒体介入教学为例</td>\n",
       "      <td>张耀天; 朱薇</td>\n",
       "      <td>齐鲁师范学院学报</td>\n",
       "      <td>2021-06-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>39.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>大数据背景下传统媒体突围策略分析</td>\n",
       "      <td>于佳</td>\n",
       "      <td>中国报业</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>113.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>大数据时代思想政治教育“微”透视</td>\n",
       "      <td>柳海燕</td>\n",
       "      <td>中学政治教学参考</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>204.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>十八大以来马克思主义新媒体关注度的大数据分析及启示</td>\n",
       "      <td>张平</td>\n",
       "      <td>新媒体研究</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>探讨大数据下的广电新闻编辑发展创新</td>\n",
       "      <td>陈媛媛</td>\n",
       "      <td>记者摇篮</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>县市级大数据网络融媒体平台应用研究</td>\n",
       "      <td>李晓磊; 赵宇</td>\n",
       "      <td>广播与电视技术</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>大数据与新媒体时代医学期刊的办刊之路</td>\n",
       "      <td>焦骞; 刘卓; 董军杰; 张爱净</td>\n",
       "      <td>传媒论坛</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>50.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11</td>\n",
       "      <td>基于AI技术的传统媒体经营与管理创新</td>\n",
       "      <td>蔡文丰</td>\n",
       "      <td>无线互联科技</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12</td>\n",
       "      <td>大数据时代与校园文化的多元性</td>\n",
       "      <td>翟屿潼</td>\n",
       "      <td>学理论</td>\n",
       "      <td>2021-05-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>131.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>13</td>\n",
       "      <td>“人机协同”在全国两会现场报道中的实践——以人民日报新媒体5G+AI报道为例</td>\n",
       "      <td>巩晗</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2021-04-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>139.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>大数据环境下高校突发事件舆情传播与舆论引导研究</td>\n",
       "      <td>王昌文; 刘静</td>\n",
       "      <td>甘肃科技</td>\n",
       "      <td>2021-04-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>35.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>创新传媒教育模式 培养现代传媒人才——评《大数据时代传媒教育研究》</td>\n",
       "      <td>郭慧; 陈红梅; 阎瑞华</td>\n",
       "      <td>山西财经大学学报</td>\n",
       "      <td>2021-04-26 10:35</td>\n",
       "      <td>NaN</td>\n",
       "      <td>106.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>16</td>\n",
       "      <td>基于人工智能技术的新媒体交互艺术表达设计</td>\n",
       "      <td>许洋洋</td>\n",
       "      <td>自动化技术与应用</td>\n",
       "      <td>2021-04-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>149.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>17</td>\n",
       "      <td>新媒体时代人工智能作品的著作权法地位探讨——评《著作权法前沿热点问题探究》</td>\n",
       "      <td>杜菁</td>\n",
       "      <td>新闻爱好者</td>\n",
       "      <td>2021-04-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>247.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>播音主持应对人工智能的策略与思考</td>\n",
       "      <td>胡未央</td>\n",
       "      <td>中国报业</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>53.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>19</td>\n",
       "      <td>大数据背景下媒体融合发展趋势探讨</td>\n",
       "      <td>侯玉娟</td>\n",
       "      <td>广播电视信息</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>124.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20</td>\n",
       "      <td>5G新媒体平台大数据系统运维体系的建设</td>\n",
       "      <td>芦丽丽</td>\n",
       "      <td>现代电视技术</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>31.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>21</td>\n",
       "      <td>基于大数据的电视节目评价体系构建研究</td>\n",
       "      <td>宋凯; 庞雪芮</td>\n",
       "      <td>湖南工业大学学报(社会科学版)</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>22</td>\n",
       "      <td>浅析大数据时代传统媒体与新媒体的融合</td>\n",
       "      <td>薛锦瑜</td>\n",
       "      <td>记者摇篮</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>177.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>23</td>\n",
       "      <td>人工智能与新媒体传播双重视域下高校美育实践的改革创新</td>\n",
       "      <td>张建; 高尚</td>\n",
       "      <td>绵阳师范学院学报</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>109.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>24</td>\n",
       "      <td>探析大数据下传统媒体与新媒体融合发展路径</td>\n",
       "      <td>林珺</td>\n",
       "      <td>传媒论坛</td>\n",
       "      <td>2021-04-08</td>\n",
       "      <td>NaN</td>\n",
       "      <td>161.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>25</td>\n",
       "      <td>大数据环境下读者阅读行为转变与可视化分析——基于Cite Space</td>\n",
       "      <td>孔洁</td>\n",
       "      <td>兰台世界</td>\n",
       "      <td>2021-04-06</td>\n",
       "      <td>NaN</td>\n",
       "      <td>59.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>26</td>\n",
       "      <td>Research on Enterprise Human Resource Manageme...</td>\n",
       "      <td>Wang Fang</td>\n",
       "      <td>Journal of Physics: Conference Series</td>\n",
       "      <td>2021-04-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>27</td>\n",
       "      <td>大数据背景下高校图书馆服务创新与发展的研究</td>\n",
       "      <td>朱茂富; 孙琳; 高国瑞</td>\n",
       "      <td>内蒙古科技与经济</td>\n",
       "      <td>2021-03-31</td>\n",
       "      <td>NaN</td>\n",
       "      <td>33.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>28</td>\n",
       "      <td>大数据时代高校党建工作创新路径研究</td>\n",
       "      <td>黄璞; 李岩</td>\n",
       "      <td>办公室业务</td>\n",
       "      <td>2021-03-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>87.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>29</td>\n",
       "      <td>大数据视域下新媒体环境中地方高校档案管理和公共服务能力建设</td>\n",
       "      <td>郭晓文</td>\n",
       "      <td>赤峰学院学报(自然科学版)</td>\n",
       "      <td>2021-03-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>30</td>\n",
       "      <td>大数据时代新媒介视觉艺术现状与价值</td>\n",
       "      <td>涂玉洁</td>\n",
       "      <td>中国报业</td>\n",
       "      <td>2021-03-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>71.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>31</td>\n",
       "      <td>互联网新媒体时代下大数据营销中的伦理建设探析</td>\n",
       "      <td>吕颖迪; 于孟晨</td>\n",
       "      <td>商业文化</td>\n",
       "      <td>2021-03-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>84.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>32</td>\n",
       "      <td>Analysis of The Impact of New Media Tools on C...</td>\n",
       "      <td>Xin Zhou</td>\n",
       "      <td>International Journal of Education and Teachin...</td>\n",
       "      <td>2021-03-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>33</td>\n",
       "      <td>人工智能主播的应用策略</td>\n",
       "      <td>王梦颖; 李怀苍</td>\n",
       "      <td>宁夏师范学院学报</td>\n",
       "      <td>2021-03-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>73.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>34</td>\n",
       "      <td>智能导播助力2021春晚新媒体节目创新——浅析人工智能切换技术的应用</td>\n",
       "      <td>陈戈</td>\n",
       "      <td>现代电视技术</td>\n",
       "      <td>2021-03-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>55.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>35</td>\n",
       "      <td>大数据时代下全媒体营销路径创新研究</td>\n",
       "      <td>庞体慧</td>\n",
       "      <td>中国高新科技</td>\n",
       "      <td>2021-03-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>36</td>\n",
       "      <td>New Media User Behaviour Research Based on Big...</td>\n",
       "      <td>Zhu Zhixuan</td>\n",
       "      <td>Journal of Physics: Conference Series</td>\n",
       "      <td>2021-03-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>37</td>\n",
       "      <td>新媒体大数据下大学生党建与思政互构性研究</td>\n",
       "      <td>张正; 杨会朴</td>\n",
       "      <td>文化产业</td>\n",
       "      <td>2021-02-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>108.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>38</td>\n",
       "      <td>大数据时代背景下高校思想政治教育创新研究</td>\n",
       "      <td>陈坤; 李佳</td>\n",
       "      <td>思想政治教育研究</td>\n",
       "      <td>2021-02-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>484.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>39</td>\n",
       "      <td>新媒体背景下专业教材出版思路研究——基于人工智能专业教材出版实践</td>\n",
       "      <td>祝智敏; 李晓雨; 吴振宇</td>\n",
       "      <td>中国传媒科技</td>\n",
       "      <td>2021-02-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>35.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>40</td>\n",
       "      <td>AI和大数据技术对新媒体传播的影响及应用分析</td>\n",
       "      <td>田新梅</td>\n",
       "      <td>中国有线电视</td>\n",
       "      <td>2021-02-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>111.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>41</td>\n",
       "      <td>大数据时代英语翻译教学新模式的建构——评《大数据时代云端翻转课堂模式下的口译教学探索》</td>\n",
       "      <td>王大维</td>\n",
       "      <td>中国科技论文</td>\n",
       "      <td>2021-02-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>126.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>42</td>\n",
       "      <td>以大数据推进全媒体时代高校思想政治教育——评《新媒体时代高校思想政治教育模式探究》</td>\n",
       "      <td>王谦</td>\n",
       "      <td>中国科技论文</td>\n",
       "      <td>2021-02-15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>157.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>43</td>\n",
       "      <td>基于大数据的消费文化语境下自媒体剽窃等侵权现象研究</td>\n",
       "      <td>陆璐</td>\n",
       "      <td>滁州学院学报</td>\n",
       "      <td>2021-02-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>62.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>44</td>\n",
       "      <td>大数据时代英语媒体的发展战略探析</td>\n",
       "      <td>张敏</td>\n",
       "      <td>新闻研究导刊</td>\n",
       "      <td>2021-02-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>31.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>45</td>\n",
       "      <td>基于人工智能的传媒企业发展探析</td>\n",
       "      <td>孙芳</td>\n",
       "      <td>传媒论坛</td>\n",
       "      <td>2021-02-07</td>\n",
       "      <td>NaN</td>\n",
       "      <td>139.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>46</td>\n",
       "      <td>大数据背景下传统媒体与新媒体融合路径探析</td>\n",
       "      <td>杨文惠</td>\n",
       "      <td>传媒论坛</td>\n",
       "      <td>2021-02-07</td>\n",
       "      <td>NaN</td>\n",
       "      <td>151.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>47</td>\n",
       "      <td>大数据背景下传统媒体与新媒体融合发展战略</td>\n",
       "      <td>黄猛猛; 黄瑶</td>\n",
       "      <td>西部广播电视</td>\n",
       "      <td>2021-02-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>24.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>48</td>\n",
       "      <td>大数据时代高校突发事件网络舆情引导机制研究</td>\n",
       "      <td>陈娟; 康秀平; 许莹莹</td>\n",
       "      <td>声屏世界</td>\n",
       "      <td>2021-02-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>48.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>49</td>\n",
       "      <td>大数据背景下的农业农村新闻创作导向分析</td>\n",
       "      <td>唐雪莲</td>\n",
       "      <td>声屏世界</td>\n",
       "      <td>2021-02-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>50</td>\n",
       "      <td>大数据技术对新闻传播领域的影响分析</td>\n",
       "      <td>李朝敏</td>\n",
       "      <td>传媒论坛</td>\n",
       "      <td>2021-01-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>173.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Unnamed: 0                                                 篇名  \\\n",
       "0            1                                 农业大数据在农业经济管理中的应用分析   \n",
       "1            2                              大数据、新媒体技术在广电中的应用和前景简析   \n",
       "2            3                从技术与社会的互动中看当前新媒体技术的发展——以5G和人工智能技术为例   \n",
       "3            4                  大数据时代大学思政教学改革实践及其反思——以抖音新媒体介入教学为例   \n",
       "4            5                                   大数据背景下传统媒体突围策略分析   \n",
       "5            6                                   大数据时代思想政治教育“微”透视   \n",
       "6            7                          十八大以来马克思主义新媒体关注度的大数据分析及启示   \n",
       "7            8                                  探讨大数据下的广电新闻编辑发展创新   \n",
       "8            9                                  县市级大数据网络融媒体平台应用研究   \n",
       "9           10                                 大数据与新媒体时代医学期刊的办刊之路   \n",
       "10          11                                 基于AI技术的传统媒体经营与管理创新   \n",
       "11          12                                     大数据时代与校园文化的多元性   \n",
       "12          13             “人机协同”在全国两会现场报道中的实践——以人民日报新媒体5G+AI报道为例   \n",
       "13          14                            大数据环境下高校突发事件舆情传播与舆论引导研究   \n",
       "14          15                  创新传媒教育模式 培养现代传媒人才——评《大数据时代传媒教育研究》   \n",
       "15          16                               基于人工智能技术的新媒体交互艺术表达设计   \n",
       "16          17              新媒体时代人工智能作品的著作权法地位探讨——评《著作权法前沿热点问题探究》   \n",
       "17          18                                   播音主持应对人工智能的策略与思考   \n",
       "18          19                                   大数据背景下媒体融合发展趋势探讨   \n",
       "19          20                                5G新媒体平台大数据系统运维体系的建设   \n",
       "20          21                                 基于大数据的电视节目评价体系构建研究   \n",
       "21          22                                 浅析大数据时代传统媒体与新媒体的融合   \n",
       "22          23                         人工智能与新媒体传播双重视域下高校美育实践的改革创新   \n",
       "23          24                               探析大数据下传统媒体与新媒体融合发展路径   \n",
       "24          25                 大数据环境下读者阅读行为转变与可视化分析——基于Cite Space   \n",
       "25          26  Research on Enterprise Human Resource Manageme...   \n",
       "26          27                              大数据背景下高校图书馆服务创新与发展的研究   \n",
       "27          28                                  大数据时代高校党建工作创新路径研究   \n",
       "28          29                      大数据视域下新媒体环境中地方高校档案管理和公共服务能力建设   \n",
       "29          30                                  大数据时代新媒介视觉艺术现状与价值   \n",
       "30          31                             互联网新媒体时代下大数据营销中的伦理建设探析   \n",
       "31          32  Analysis of The Impact of New Media Tools on C...   \n",
       "32          33                                        人工智能主播的应用策略   \n",
       "33          34                 智能导播助力2021春晚新媒体节目创新——浅析人工智能切换技术的应用   \n",
       "34          35                                  大数据时代下全媒体营销路径创新研究   \n",
       "35          36  New Media User Behaviour Research Based on Big...   \n",
       "36          37                               新媒体大数据下大学生党建与思政互构性研究   \n",
       "37          38                               大数据时代背景下高校思想政治教育创新研究   \n",
       "38          39                   新媒体背景下专业教材出版思路研究——基于人工智能专业教材出版实践   \n",
       "39          40                             AI和大数据技术对新媒体传播的影响及应用分析   \n",
       "40          41        大数据时代英语翻译教学新模式的建构——评《大数据时代云端翻转课堂模式下的口译教学探索》   \n",
       "41          42          以大数据推进全媒体时代高校思想政治教育——评《新媒体时代高校思想政治教育模式探究》   \n",
       "42          43                          基于大数据的消费文化语境下自媒体剽窃等侵权现象研究   \n",
       "43          44                                   大数据时代英语媒体的发展战略探析   \n",
       "44          45                                    基于人工智能的传媒企业发展探析   \n",
       "45          46                               大数据背景下传统媒体与新媒体融合路径探析   \n",
       "46          47                               大数据背景下传统媒体与新媒体融合发展战略   \n",
       "47          48                              大数据时代高校突发事件网络舆情引导机制研究   \n",
       "48          49                                大数据背景下的农业农村新闻创作导向分析   \n",
       "49          50                                  大数据技术对新闻传播领域的影响分析   \n",
       "\n",
       "                  作者                                                 刊名  \\\n",
       "0                李文萍                                               农家参谋   \n",
       "1                成连港                                             广播电视信息   \n",
       "2                李颖章                                             中国传媒科技   \n",
       "3            张耀天; 朱薇                                           齐鲁师范学院学报   \n",
       "4                 于佳                                               中国报业   \n",
       "5                柳海燕                                           中学政治教学参考   \n",
       "6                 张平                                              新媒体研究   \n",
       "7                陈媛媛                                               记者摇篮   \n",
       "8            李晓磊; 赵宇                                            广播与电视技术   \n",
       "9   焦骞; 刘卓; 董军杰; 张爱净                                               传媒论坛   \n",
       "10               蔡文丰                                             无线互联科技   \n",
       "11               翟屿潼                                                学理论   \n",
       "12                巩晗                                               青年记者   \n",
       "13           王昌文; 刘静                                               甘肃科技   \n",
       "14      郭慧; 陈红梅; 阎瑞华                                           山西财经大学学报   \n",
       "15               许洋洋                                           自动化技术与应用   \n",
       "16                杜菁                                              新闻爱好者   \n",
       "17               胡未央                                               中国报业   \n",
       "18               侯玉娟                                             广播电视信息   \n",
       "19               芦丽丽                                             现代电视技术   \n",
       "20           宋凯; 庞雪芮                                    湖南工业大学学报(社会科学版)   \n",
       "21               薛锦瑜                                               记者摇篮   \n",
       "22            张建; 高尚                                           绵阳师范学院学报   \n",
       "23                林珺                                               传媒论坛   \n",
       "24                孔洁                                               兰台世界   \n",
       "25         Wang Fang              Journal of Physics: Conference Series   \n",
       "26      朱茂富; 孙琳; 高国瑞                                           内蒙古科技与经济   \n",
       "27            黄璞; 李岩                                              办公室业务   \n",
       "28               郭晓文                                      赤峰学院学报(自然科学版)   \n",
       "29               涂玉洁                                               中国报业   \n",
       "30          吕颖迪; 于孟晨                                               商业文化   \n",
       "31          Xin Zhou  International Journal of Education and Teachin...   \n",
       "32          王梦颖; 李怀苍                                           宁夏师范学院学报   \n",
       "33                陈戈                                             现代电视技术   \n",
       "34               庞体慧                                             中国高新科技   \n",
       "35       Zhu Zhixuan              Journal of Physics: Conference Series   \n",
       "36           张正; 杨会朴                                               文化产业   \n",
       "37            陈坤; 李佳                                           思想政治教育研究   \n",
       "38     祝智敏; 李晓雨; 吴振宇                                             中国传媒科技   \n",
       "39               田新梅                                             中国有线电视   \n",
       "40               王大维                                             中国科技论文   \n",
       "41                王谦                                             中国科技论文   \n",
       "42                陆璐                                             滁州学院学报   \n",
       "43                张敏                                             新闻研究导刊   \n",
       "44                孙芳                                               传媒论坛   \n",
       "45               杨文惠                                               传媒论坛   \n",
       "46           黄猛猛; 黄瑶                                             西部广播电视   \n",
       "47      陈娟; 康秀平; 许莹莹                                               声屏世界   \n",
       "48               唐雪莲                                               声屏世界   \n",
       "49               李朝敏                                               传媒论坛   \n",
       "\n",
       "                发表时间   被引     下载   操作  \n",
       "0         2021-06-25  NaN    NaN   下载  \n",
       "1         2021-06-15  NaN   42.0   下载  \n",
       "2         2021-06-15  NaN    NaN   下载  \n",
       "3         2021-06-01  NaN   39.0   下载  \n",
       "4         2021-05-25  NaN  113.0   下载  \n",
       "5         2021-05-25  NaN  204.0   下载  \n",
       "6         2021-05-25  NaN    NaN   下载  \n",
       "7         2021-05-15  NaN   15.0   下载  \n",
       "8         2021-05-15  NaN    7.0   下载  \n",
       "9         2021-05-10  NaN   50.0   下载  \n",
       "10        2021-05-10  NaN    NaN   下载  \n",
       "11        2021-05-05  NaN  131.0   下载  \n",
       "12        2021-04-30  NaN  139.0   下载  \n",
       "13        2021-04-30  NaN   35.0   下载  \n",
       "14  2021-04-26 10:35  NaN  106.0   下载  \n",
       "15        2021-04-25  NaN  149.0   下载  \n",
       "16        2021-04-20  NaN  247.0   下载  \n",
       "17        2021-04-15  NaN   53.0   下载  \n",
       "18        2021-04-15  NaN  124.0   下载  \n",
       "19        2021-04-15  NaN   31.0   下载  \n",
       "20        2021-04-15  NaN   32.0   下载  \n",
       "21        2021-04-15  NaN  177.0   下载  \n",
       "22        2021-04-15  NaN  109.0   下载  \n",
       "23        2021-04-08  NaN  161.0   下载  \n",
       "24        2021-04-06  NaN   59.0   下载  \n",
       "25        2021-04-01  NaN    NaN  NaN  \n",
       "26        2021-03-31  NaN   33.0   下载  \n",
       "27        2021-03-25  NaN   87.0   下载  \n",
       "28        2021-03-25  NaN   32.0   下载  \n",
       "29        2021-03-25  NaN   71.0   下载  \n",
       "30        2021-03-25  NaN   84.0   下载  \n",
       "31        2021-03-20  NaN    NaN  NaN  \n",
       "32        2021-03-15  NaN   73.0   下载  \n",
       "33        2021-03-15  NaN   55.0   下载  \n",
       "34        2021-03-10  NaN    4.0   下载  \n",
       "35        2021-03-01  NaN    NaN  NaN  \n",
       "36        2021-02-28  NaN  108.0   下载  \n",
       "37        2021-02-20  NaN  484.0   下载  \n",
       "38        2021-02-15  NaN   35.0   下载  \n",
       "39        2021-02-15  NaN  111.0   下载  \n",
       "40        2021-02-15  NaN  126.0   下载  \n",
       "41        2021-02-15  1.0  157.0   下载  \n",
       "42        2021-02-15  NaN   62.0   下载  \n",
       "43        2021-02-10  NaN   31.0   下载  \n",
       "44        2021-02-07  NaN  139.0   下载  \n",
       "45        2021-02-07  NaN  151.0   下载  \n",
       "46        2021-02-05  NaN   24.0   下载  \n",
       "47        2021-02-05  NaN   48.0   下载  \n",
       "48        2021-02-05  NaN   10.0   下载  \n",
       "49        2021-01-25  NaN  173.0   下载  "
      ]
     },
     "execution_count": 277,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "element = driver.find_element_by_id('gridTable')\n",
    "page_html = element.get_attribute('innerHTML')\n",
    "首页主要数据 = pd.read_html(page_html)[0]\n",
    "首页主要数据"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "stretch-consultation",
   "metadata": {},
   "source": [
    "# 可视化数据下载"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "inappropriate-wilson",
   "metadata": {},
   "source": [
    "数据分析需要用到end格式的文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "id": "pediatric-transparency",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将循环时需要点击的元素做成列表\n",
    "piliang_list = []\n",
    "piliang_list.append('//*[@id=\"gridTable\"]/div[1]/div[2]/div[1]/a')\n",
    "piliang_list.append(\"/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/div[1]/label/input\")\n",
    "piliang_list.append(\"/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/i\")\n",
    "piliang_list.append(\"/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/ul/li[1]/a\")\n",
    "piliang_list.append(\"/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/ul/li[1]/ul/li[9]/a\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "id": "aggregate-inspection",
   "metadata": {},
   "outputs": [],
   "source": [
    "for page in range(0,20):\n",
    "    for i in piliang_list:\n",
    "        element = driver.find_element_by_xpath(i)\n",
    "        element.click()\n",
    "        time.sleep(2)\n",
    "    driver.switch_to.window(driver.window_handles[2])#定位新窗口\n",
    "    time.sleep(5)\n",
    "    driver.find_element_by_xpath('//*[@id=\"litotxt\"]/a').click()#点击导出\n",
    "    time.sleep(5)\n",
    "    driver.close()#关闭新窗口\n",
    "    time.sleep(5)\n",
    "    driver.switch_to.window(driver.window_handles[1])#定位回原窗口\n",
    "    driver.find_element_by_xpath('//*[@id=\"PageNext\"]').click()#点击下一页\n",
    "    time.sleep(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "directed-ideal",
   "metadata": {},
   "source": [
    "# PDF下载"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 278,
   "id": "bearing-cosmetic",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-278-8c13ac749d4b>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# 先切换窗口并清除全选\n",
    "driver.switch_to_window(driver.window_handles[1])\n",
    "driver.find_element_by_xpath('//*[@id=\"gridTable\"]/div[1]/div[2]/div[1]/a').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "alive-error",
   "metadata": {},
   "source": [
    "## 封装"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 279,
   "id": "orange-garden",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 封装判断验证码\n",
    "def img_have():\n",
    "    try:\n",
    "        driver.find_element_by_xpath('//*[@id=\"vImg\"]')#验证码图片\n",
    "        return True#为真，即有验证码图片存在\n",
    "    except:\n",
    "        return False#反之为空则没有验证码，返回False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 280,
   "id": "dried-evolution",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 封装判断pdf是否可以下载\n",
    "def pdf_exist():\n",
    "    try:\n",
    "        driver.find_element_by_xpath('//*[@id=\"pdfDown\"]') # 下载\n",
    "        return True # 判断为真，即可下载\n",
    "    except:\n",
    "        return False # #反之判断为空，不可以下载，返回False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 363,
   "id": "third-edwards",
   "metadata": {},
   "outputs": [],
   "source": [
    "#截图\n",
    "def img():\n",
    "    driver.find_element_by_xpath('//*[@id=\"vImg\"]')\n",
    "    time.sleep(2)\n",
    "    ele = driver.find_element_by_id('vImg')\n",
    "    ele.screenshot(r'C:\\Users\\lenovo\\Desktop\\数据挖掘\\验证码.png')\n",
    "    img=r\"C:\\Users\\lenovo\\Desktop\\数据挖掘\\验证码.png\"\n",
    "    return img"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 282,
   "id": "matched-technical",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 封装图鉴API\n",
    "def api(uname, pwd, img, typeid):\n",
    "    with open(r'C:\\Users\\Ctwo\\data_out\\screenImg.png', 'rb') as f:    \n",
    "        base64_data = base64.b64encode(f.read())\n",
    "        img = base64_data.decode()\n",
    "    data = {\"username\": \"ctlctl\", \"password\": \"134679\", \"typeid\": \"3\", \"image\": img}\n",
    "    time.sleep(1)\n",
    "    result = json.loads(requests.post(\"http://api.ttshitu.com/predict\", json=data).text)\n",
    "    if result['success']:\n",
    "        return result[\"data\"][\"result\"]\n",
    "    else:\n",
    "        return result[\"message\"]\n",
    "    return \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 283,
   "id": "tropical-reaction",
   "metadata": {},
   "outputs": [],
   "source": [
    "#封装输入验证码\n",
    "def input():\n",
    "    verifycode = api(uname, pwd, img, typeid)\n",
    "    driver.find_element_by_xpath('//*[@id=\"vcode\"]').clear()\n",
    "    element = driver.find_element_by_xpath('//*[@id=\"vcode\"]').send_keys(verifycode)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "national-sarah",
   "metadata": {},
   "source": [
    "## 判断"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 284,
   "id": "cognitive-finish",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 页面切换\n",
    "def switch():\n",
    "    all_window=driver.window_handles\n",
    "    new_page = len(all_window)\n",
    "    driver.switch_to.window(driver.window_handles[new_page-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 285,
   "id": "specified-claim",
   "metadata": {},
   "outputs": [],
   "source": [
    "#封装正确情况判断\n",
    "def true():\n",
    "    switch()\n",
    "    driver.close()\n",
    "    driver.switch_to.window(driver.window_handles[2])\n",
    "    time.sleep(1)\n",
    "    switch()\n",
    "    driver.close()\n",
    "    driver.switch_to.window(driver.window_handles[1])\n",
    "    time.sleep(1)\n",
    "    switch()\n",
    "    driver.close()\n",
    "    driver.switch_to.window(driver.window_handles[1])\n",
    "    time.sleep(1)   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 286,
   "id": "excessive-movement",
   "metadata": {},
   "outputs": [],
   "source": [
    "## #封装检查下载文件数\n",
    "def file1():\n",
    "    path = r'C:\\Users\\Ctwo\\data_out\\pdf文章'\n",
    "    file = int(len([lists for lists in os.listdir(path) if os.path.isfile(os.path.join(path, lists))]))#通过len和int将验证码输入前的文件夹里的文件数量进行检测，获取数量为未输入验证码的文件量，需调用os模块\n",
    "    return file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 287,
   "id": "empirical-crowd",
   "metadata": {},
   "outputs": [],
   "source": [
    "def file2():\n",
    "    path = r'C:\\Users\\Ctwo\\data_out\\pdf文章'\n",
    "    file2 = int(len([lists for lists in os.listdir(path) if os.path.isfile(os.path.join(path, lists))]))#通过len和int将验证码输入后的文件夹里的文件数量进行检测，获取数量为输入验证码后文件夹里的文件量，需调用os模块\n",
    "    return file2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 288,
   "id": "legal-first",
   "metadata": {},
   "outputs": [],
   "source": [
    "#判断有无下载\n",
    "def file3():\n",
    "    if file2 > file:#如果输入验证码后的文件夹里文件数量增加，也就是数量大于输入前文件数量，则代表下载成功，验证码输入正确，返回True\n",
    "        return True\n",
    "    else:\n",
    "        return False # 反之则失败，输入错误，没有下载，文件数量没有大于前一次，返回False。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 403,
   "id": "present-issue",
   "metadata": {},
   "outputs": [],
   "source": [
    "#返回列表页 报错的时候重新运行时方便用\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "nonprofit-macintosh",
   "metadata": {},
   "source": [
    "## 实现爬取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 404,
   "id": "unlimited-telephone",
   "metadata": {},
   "outputs": [],
   "source": [
    "for page in range(1,3): #先进行两页循环，可自行选择爬取页数\n",
    "    for link in range(1,50):#可选择从一页中的第几篇文章开始挖取到第几篇文章结束挖取\n",
    "        pdf_xpath = '//*[@id=\"gridTable\"]/table/tbody/tr[{}]/td[2]/a'.format(link)\n",
    "        driver.find_element_by_xpath(pdf_xpath).click()#点击文章进入文章详情页\n",
    "        time.sleep(0)\n",
    "        driver.switch_to.window(driver.window_handles[2])#定位新窗口\n",
    "        pdf_exist()#判断是否弹出验证码窗口\n",
    "        s = pdf_exist()\n",
    "        if s is True:#如果有下载按钮\n",
    "            driver.find_element_by_xpath('//*[@id=\"pdfDown\"]').click()#点击下载\n",
    "        time.sleep(2)\n",
    "        switch()#切换到最新页面\n",
    "        time.sleep(2)\n",
    "        img_have()#判断是否弹出验证码窗口\n",
    "        abc = img_have()\n",
    "        if abc is True:#如果有验证码窗口\n",
    "            time.sleep(2)\n",
    "            api(uname, pwd, img(), typeid)#调用图鉴api识别验证码内容\n",
    "            input()#填写识别出来的验证码内容\n",
    "            file1()#检测下载文件夹内的文件数量\n",
    "            file = file1()\n",
    "            time.sleep(2)\n",
    "            driver.find_element_by_xpath('/html/body/div/form/dl/dd/button').click()#点击验证码提交\n",
    "            file2()#检测点击下载后文件夹内的文件数量\n",
    "            file2 = file2()\n",
    "            time.sleep(2)\n",
    "            file3()#判断下载文件夹内的文件数量是否变化\n",
    "            abc2 = file3()\n",
    "            while abc2 == False:#如果数量没有增加，则返回False，判断等于False，则重新进行验证码爬取识别以及输入并再次检测\n",
    "                api(uname, pwd, img(), typeid)#调用图鉴api识别验证码内容\n",
    "                input()#填写识别出来的验证码内容\n",
    "                file1()#检测下载文件夹内的文件数量\n",
    "                file = file1()\n",
    "                driver.find_element_by_xpath('/html/body/div/form/dl/dd/button').click()#点击验证码提交\n",
    "                file2()#检测点击下载后文件夹内的文件数量\n",
    "                file2 = file2()\n",
    "                file3()#判断下载文件夹内的文件数量是否变化\n",
    "                abc2 = file3()\n",
    "                if abc2 == True:#如果判断等于True，则代表输入正确下载成功\n",
    "                    break\n",
    "            true()#下载成功，情况正确，关闭窗口，返回原窗口，进行下一篇文章PDF原文文件下载   \n",
    "        else:#没有验证码窗口或按钮，则直接下载原文文件，后循环下载之后的pdf文章\n",
    "            switch()\n",
    "            driver.close()\n",
    "            driver.switch_to.window(driver.window_handles[1])\n",
    "            time.sleep(2)\n",
    "    driver.find_element_by_xpath('//*[@id=\"PageNext\"]').click()#一页循环挖取结束，点击进入下一页\n",
    "    time.sleep(1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "284.444px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
